library(dplyr)
library(tidyverse)
library(readxl)
# setwd("D:PhD/01_Data/03_Vitro/04_Colonization difference between WT and KO") # Windows
# setwd("/Volumes/Yiming_Wang/PhD/01_Data/03_Vitro/04_Colonization difference between WT and KO") # Mac
setwd("/Users/godric/Desktop/Fut2 review comments/Revision") #Macbook pro
df.all <- read_excel("Data for AUC.xlsx")
# First get rid of negative data
# Then gather data to have long table
# Then separate type to get hr and type
df.1<-df.all%>%
filter(Genotype!="Negative")%>%
gather(type,value,-c(No,ID,Genotype,Sex,GenotypeSex,Cage))%>% # convert the original table to long table, create two new columns and put them after -c()
separate(type,c("hr","Treatment"),remove=T) # Seprate type into two columns, e.g. "24hr_NG", separate function can separate the item which has non-alphabet/number symbol into two parts
df.2 <- df.1 %>%
arrange(ID,hr) %>%
group_by(ID,hr) %>%
mutate(OD_difference = value - lag(value)) %>%
filter(!is.na(OD_difference)) %>%
select(ID, Genotype, Sex, GenotypeSex, hr, OD_difference)
summary <- df.2 %>%
group_by(Genotype,hr) %>%
summarise(value.mean=mean(OD_difference), sd=sd(OD_difference))%>%
ungroup()
# min line is for ribbon in figure
min.line<-summary%>%
select(-sd)%>%
mutate(hr = as.numeric(gsub("hr","",gsub("X","",hr)))) %>%
spread(Genotype,value.mean)%>%
group_by(hr)%>%
mutate(min_line=min(WT,KO))%>%
select(hr,min_line)
summary.final<-summary %>%
mutate(hr.new = as.numeric(gsub("hr","",gsub("X","",hr))))%>%
left_join(min.line,by=c("hr.new"="hr"))
# Gather negative control data
neg.ctrl<-df.all %>%
filter(Genotype == "Negative")%>%
gather(type,value,-c(No,ID,Genotype,Sex,GenotypeSex,Cage))%>%
separate(type,c("hr","Treatment"),remove=T)%>%
group_by(ID,hr) %>%
mutate(OD_difference = value - lag(value)) %>%
filter(!is.na(OD_difference)) %>%
select(ID, Genotype, Sex, GenotypeSex, hr, OD_difference) %>%
group_by(Genotype,hr) %>%
summarise(value.mean=mean(OD_difference),sd=sd(OD_difference))%>% #Summarize mean and standard deviation for future figure use
ungroup()%>%
mutate(hr.new=as.numeric(gsub("hr","",gsub("X","",hr))))%>%
mutate(min_line=0)
# select(Genotype,hr,Treatment,value.mean)%>%
# rename(Genotype=ID)%>%
# mutate(sd=0)#The negative control is under ID column, to bind negative control and my other data, the ID column should be renamed to Genotype
#bind data
summary.final2<-rbind(summary.final,neg.ctrl)
#
# summary.final2<-summary.final2%>%
# mutate(group=ifelse(group=="G KO","KO with 2'-FL",
# ifelse(group=="NG KO","KO without glycan",
# ifelse(group=="G WT", "WT with 2'-FL",
# ifelse(group=="NG WT", "WT without glycan",
# ifelse(group=="G Negative", "Negative control with 2'-FL",
# "Negative control without glycan")))))) %>%
# mutate(group=factor(group,levels=c("KO with 2'-FL","KO without glycan",
# "WT with 2'-FL", "WT without glycan",
# "Negative control with 2'-FL", "Negative control without glycan")))# Factor group
calculus<-summary.final %>%
filter(Genotype!="Negative") %>%
group_by(Genotype) %>%
arrange(Genotype,hr.new)%>% #Sort your data based on ID,Treatment,hr.new
mutate(sum.value=value.mean+lag(value.mean), #sum.value is the summary of the two base value
hr.interval=hr.new-lag(hr.new))%>% # hr.interval is the height of the trapezoid
mutate(area=sum.value*hr.interval/2)%>% #this area is the total area of the highest curve
# summarise(area.total=sum(area,na.rm = T))%>% # Summarize different timepoint area
ungroup() %>%
group_by(Genotype)%>%
summarise(total.area=sum(area,na.rm=T))%>%
ungroup()%>%
mutate(area.difference=total.area-lag(total.area))%>%
ungroup()%>%
filter(!is.na(area.difference))%>%
select(area.difference)#You can't rejoin the dataset because this is summarised and no id is there.
Summary Line Charts
library(ggplot2)
library(ggthemes)
library (gridExtra)
summary.final2 <- summary.final2 %>%
mutate(Genotype=ifelse(Genotype == "WT", "WT",
ifelse(Genotype == "KO", "KO", "Media control")))
level_order_genotype <- c("WT","KO", "Media control")
# WT vs KO
figure <- ggplot(data=subset(summary.final2), aes(x=hr.new,y=value.mean,group=factor(Genotype, level=level_order_genotype)))+
geom_line(size=1,aes(color=factor(Genotype, level=level_order_genotype))) +
geom_point(size=2,aes(color=factor(Genotype, level=level_order_genotype),shape=factor(Genotype, level=level_order_genotype))) +
geom_pointrange(aes(ymin=value.mean-sd, ymax=value.mean+sd,color=factor(Genotype, level=level_order_genotype))) + # error bar method 1 (just line)
theme_bw() +
labs(x="Time (hr)",y=expression(OD[600]~ difference),caption = "",
title = "")+ # Faeces collected from KO mice, you can use "title = paste0(k,"+ Negative")
theme(legend.title = element_blank(),
legend.position = "bottom",
legend.text = element_text(size=12),
axis.text.x=element_text(colour="black", face="plain", size=13),
axis.text.y=element_text(colour="black", face="plain", size=13),
axis.title.x=element_text(margin = margin(t = 5), size=13),
axis.title.y=element_text(margin = margin(r = 5), size=13),
axis.title = element_text(face="plain", size = 11),
plot.title = element_text(size=10, hjust=0.5),
panel.grid = element_blank(),
panel.border = element_rect(color = "black",
fill = NA,
size = 1),
aspect.ratio = 1,
plot.background = element_rect(fill = "transparent", colour = "transparent"))+
scale_y_continuous(limits = c(-0.01,0.5),breaks = seq(0,0.5,0.1))+
scale_x_continuous(limits=c(0,120),breaks=seq(0,120,24))+
scale_colour_manual(values=c("#3C5488B2","#7E6148B2","#3B3B3BB2")) +
geom_ribbon(aes(ymax=value.mean, ymin=min_line),fill="#cfcfcf",alpha = 0.2)+ # ivory3, seashell3, gray66, #87c0e6, #ffa0aa, #808080
annotate("text",x=60,y=0.5,
label=expression(italic("P") == "0.0051"),hjust=0.5, size=4.5)
figure

Calculate area of each sample
neg.ctrl2<-neg.ctrl%>%
rename(ID=1)%>% # Rename the first column name as ID
mutate(hr.new = as.numeric(gsub("hr","",gsub("X","",hr)))) # Create a new column with min.line=0
neg.ctrl2 = subset(neg.ctrl2, select=-c(hr, sd)) %>%
rename (hr=hr.new) %>%
rename (value=value.mean) %>%
mutate (Genotype = "Negative") %>%
select(ID, hr, Genotype, value)
data.individual.final<-df.1 %>%
select(ID,hr,Genotype, Treatment,value) %>%
spread(Treatment,value) %>% # Transform the table to normal table (wide)
mutate(hr = as.numeric(gsub("hr","",gsub("X","",hr))))%>%
group_by(ID,hr)%>%
mutate(value = G-NG) %>%
select(ID, hr, Genotype, value) %>%
rbind(neg.ctrl2)
# min.line<-summary%>%
# select(-sd)%>%
# mutate(hr = as.numeric(gsub("hr","",gsub("X","",hr)))) %>%
# spread(Genotype,value.mean)%>%
# group_by(hr)%>%
# mutate(min_line=min(WT,KO))%>%
# select(hr,min_line)
calculus<-data.individual.final %>%
filter(ID!="Negative") %>%
group_by(ID,Genotype) %>%
arrange(ID,Genotype,hr)%>% #Sort your data based on ID,Treatment,hr.new
mutate(sum.value=value+lag(value), #sum.value is the summary of the two base value
hr.interval=hr-lag(hr))%>% # hr.interval is the height of the trapezoid
mutate(area=sum.value*hr.interval/2)%>% #this area is the total area of the highest curve
summarise(area.total=sum(area,na.rm = T))%>% # Summarize different timepoint area
ungroup()
Statistical analysis
Normality test
set.seed(123)
calculus_WT <- calculus %>%
filter(Genotype == "WT")
calculus_KO <- calculus %>%
filter(Genotype == "KO")
# Choosing a normality test
## Answer: D'Agostino-Pearson normality test ("omnibus K2" test)
## Reasons: It first computes the skewness and kurtosis to quantify how far the distribution is from Gaussian in terms of asymmetry and shape. It then calculates how far each of these values differs from the value expected with a Gaussian distribution, and computes a single P value from the sum of these discrepancies. It is a versatile and powerful normality test. (https://www.graphpad.com/guides/prism/latest/statistics/stat_choosing_a_normality_test.htm)
# D'Agostino-Pearson normality test ("omnibus K2" test)
##Null (H0): The data are normally distributed Alternate/the data are sampled from a Gaussian distribution
##The low p-value (<0.05) indicates you reject that null hypothesis and so accept the alternative hypothesis that the data are not sampled from a Gaussian population
## https://www.rdocumentation.org/packages/PoweR/versions/1.0.7/topics/statcompute
# Example
library(PoweR)
getindex() # index 6 is D'Agostino-Pearson normality test ("omnibus K2" test)
## Index Law Nbparams Default1 Default2 Default3
## 1 1 Laplace(mu,b) 2 0.000000 1 NA
## 2 2 Normal(mu,sigma) 2 0.000000 1 NA
## 3 3 Cauchy(mu,sigma) 2 0.000000 1 NA
## 4 4 Logistic(mu,sigma) 2 0.000000 1 NA
## 5 5 Gamma(shape,rate) 2 2.000000 1 NA
## 6 6 Beta(a,b) 2 1.000000 1 NA
## 7 7 Uniform(a,b) 2 0.000000 1 NA
## 8 8 Student-t(df) 1 1.000000 NA NA
## 9 9 Chi-squared(df) 1 1.000000 NA NA
## 10 10 Lognormal(logmean,logsd) 2 0.000000 1 NA
## 11 11 Weibull(shape,scale) 2 1.000000 1 NA
## 12 12 ShiftedExp(l,rate) 2 0.000000 1 NA
## 13 13 U^{j+1} 1 1.000000 NA NA
## 14 14 AveUnif(k,a,b) 3 2.000000 0 1.0
## 15 15 UUnif(j) 1 1.000000 NA NA
## 16 16 VUnif(j) 1 1.000000 NA NA
## 17 17 JSU(mu,sigma,nu,tau) 4 0.000000 1 0.0
## 18 18 Tukey(l) 1 1.000000 NA NA
## 19 19 LoConN(p,m) 2 0.200000 3 NA
## 20 20 JSB(g,d) 2 0.000000 1 NA
## 21 21 SkewN(xi,omega,alpha) 3 0.000000 1 0.0
## 22 22 ScConN(p,d) 2 0.200000 2 NA
## 23 23 GP(mu,sigma,xi) 3 0.000000 1 0.0
## 24 24 GED(mu,sigma,p) 3 0.000000 1 1.0
## 25 25 Stable(alpha,beta,c,mu) 4 1.000000 0 1.0
## 26 26 Gumbel(mu,sigma) 2 1.000000 1 NA
## 27 27 Frechet(mu,sigma,alpha) 3 0.000000 1 1.0
## 28 28 GEV(mu,sigma,xi) 3 0.000000 1 0.0
## 29 29 GArcSine(alpha) 1 0.500000 NA NA
## 30 30 FoldN(mu,sigma) 2 0.000000 1 NA
## 31 31 MixN(p,m,d) 3 0.500000 0 1.0
## 32 32 TruncN(a,b) 2 0.000000 1 NA
## 33 33 Nout(a) 1 1.000000 NA NA
## 34 34 GEP(t1,t2,t3,crit) 4 0.500000 0 1.0
## 35 35 Exponential(lambda) 1 1.000000 NA NA
## 36 36 ALaplace(mu,b,k) 3 0.000000 1 2.0
## 37 37 NIG(alpha,beta,delta,mu) 4 1.000000 0 1.0
## 38 38 APD(theta,phi,alpha,lambda) 4 0.000000 1 0.5
## 39 39 modAPD(mu,sigma,theta1,theta2) 4 0.000000 1 0.5
## 40 40 LPtn(alpha,mu,sigma) 3 1.959964 0 1.0
## Default4
## 1 NA
## 2 NA
## 3 NA
## 4 NA
## 5 NA
## 6 NA
## 7 NA
## 8 NA
## 9 NA
## 10 NA
## 11 NA
## 12 NA
## 13 NA
## 14 NA
## 15 NA
## 16 NA
## 17 5e-01
## 18 NA
## 19 NA
## 20 NA
## 21 NA
## 22 NA
## 23 NA
## 24 NA
## 25 0e+00
## 26 NA
## 27 NA
## 28 NA
## 29 NA
## 30 NA
## 31 NA
## 32 NA
## 33 NA
## 34 1e-06
## 35 NA
## 36 NA
## 37 0e+00
## 38 2e+00
## 39 2e+00
## 40 NA
## Index Stat Alter Nbparams
## 1 1 K-S 3 0
## 2 2 AD^* 3 0
## 3 3 Z_C 3 0
## 4 4 Z_A 3 0
## 5 5 P_S 3 0
## 6 6 K^2 3 0
## 7 7 JB 3 0
## 8 8 DH 3 0
## 9 9 RJB 3 0
## 10 10 T_{Lmom} 3 0
## 11 11 T_{Lmom}^{(1)} 3 0
## 12 12 T_{Lmom}^{(2)} 3 0
## 13 13 T_{Lmom}^{(3)} 3 0
## 14 14 BM_{3-4} 3 0
## 15 15 BM_{3-6} 3 0
## 16 16 T_{MC-LR} 3 0
## 17 17 T_w 0,1,2 0
## 18 18 T_{MC-LR}-T_w 3 0
## 19 19 T_{S,5} 3 0
## 20 20 T_{K,5} 3 0
## 21 21 W 4 0
## 22 22 W' 4 0
## 23 23 tilde{W} 4 0
## 24 24 D 0,1,2 0
## 25 25 r 4 0
## 26 26 CS 3 0
## 27 27 Q 0,1,2 0
## 28 28 Q-Q* 0,1,2 0
## 29 29 BCMR 3 0
## 30 30 beta_3^2 3 0
## 31 31 T^*(alpha) 1 1
## 32 32 I_n 3 0
## 33 33 R_{sJ} 3 0
## 34 34 Q* 0,1,2 0
## 35 35 R_n 3 0
## 36 36 X_{APD} 3 0
## 37 37 Z_{EPD} 0,1,2 0
## 38 38 GLB 3 0
## 39 39 V_3-ML 0,1,2 0
## 40 40 V_4-ML 0,1,2 0
## 41 41 S 3 0
## 42 42 A^2 3 0
## 43 43 W^2 3 0
## 44 44 U^2 3 0
## 45 45 sqrt{n}D 3 0
## 46 46 V 3 0
## 47 47 T_{n,a}^{(1)}-MO 3 1
## 48 48 T_{n,a}^{(1)}-ML 3 1
## 49 49 T_{n,a}^{(2)}-MO 3 1
## 50 50 T_{n,a}^{(2)}-ML 3 1
## 51 51 T_{m,n}^{V} 4 1
## 52 52 T_{m,n}^{E} 4 1
## 53 53 T_{m,n}^{C} 4 1
## 54 54 hat{G}_n 3 0
## 55 55 V_3 0,1,2 0
## 56 56 V_4 0,1,2 0
## 57 57 K_1 3 0
## 58 58 T 3 0
## 59 59 Z 3 0
## 60 60 K 3 0
## 61 61 DLLap1 0,1,2 0
## 62 62 DLLap2 0,1,2 0
## 63 63 D_n 3 0
## 64 64 W_{n}^{2} 3 0
## 65 65 A_{n}^{2} 3 0
## 66 66 C_n 3 0
## 67 67 K_n 3 0
## 68 68 T_1 3 0
## 69 69 T_2 3 0
## 70 70 G(n) 3 0
## 71 71 Q 3 0
## 72 72 2nI^{lambda} 3 1
## 73 73 M(n) 3 0
## 74 74 L_{n}^{(m)} 4 1
## 75 75 S_{n}^{(m)} 3 1
## 76 76 H(m,n) 4 1
## 77 77 A^{*}(n) 3 0
## 78 78 D_{n,m}(phi_lambda) 3 2
## 79 79 E_{m,n} 3 1
## 80 80 T_{n,m}^{lambda} 3 2
## 81 81 Z_A 3 0
## 82 82 Z_C 3 0
## 83 83 t test 0,1,2 1
## 84 85 DLLap3 3 0
## 85 86 T(alpha_1,alpha_2) 3 1
## 86 87 T_n 3 0
## 87 88 T^{LS}(alpha) 3 1
## 88 89 T^{LS}_3(mu,alpha) 3 2
## 89 90 T^{LS}_4(alpha,mu) 3 2
## 90 91 GV_1 0,1,2 0
## 91 92 GV_2 0,1,2 0
## 92 93 Ho_K 0,1,2 0
## 93 94 Ho_U 0,1,2 0
## 94 95 Ho_V 0,1,2 0
## 95 96 Ho_W 0,1,2 0
## 96 97 SR^* 0,1,2 0
statcompute(6,calculus_WT$`area.total`, levels = c(0.05), critvalL = NULL, critvalR = NULL,
alter = 0, stat.pars = NULL)
## $statistic
## [1] 3.274137
##
## $pvalue
## [1] 0
##
## $decision
## [1] 1
##
## $alter
## [1] 3
##
## $stat.pars
## [1] NA
##
## $symbol
## [1] "K^2"
statcompute(6,calculus_KO$`area.total`, levels = c(0.05), critvalL = NULL, critvalR = NULL,
alter = 0, stat.pars = NULL)
## $statistic
## [1] 10.65753
##
## $pvalue
## [1] 0
##
## $decision
## [1] 1
##
## $alter
## [1] 3
##
## $stat.pars
## [1] NA
##
## $symbol
## [1] "K^2"
Unpaired unparametric test: Mann-whitney test
set.seed(123)
# Method 1_Base R
## Summary
Summary.area <- calculus %>%
group_by(Genotype) %>%
summarise(
count = n(),
median= median (area.total, na.rm=TRUE),
IQR = IQR (area.total,na.rm=TRUE),
IQR25 = quantile(area.total, 0.25),
IQR75 = quantile(area.total, 0.75)
)
Summary.area
## # A tibble: 2 × 6
## Genotype count median IQR IQR25 IQR75
## <chr> <int> <dbl> <dbl> <dbl> <dbl>
## 1 KO 32 16.7 6.57 13.1 19.7
## 2 WT 32 20.5 6.91 16.6 23.5
## Area difference between WT and KO
stats <- wilcox.test(area.total ~ Genotype, data=calculus,
paired=FALSE, correct=TRUE, conf.int=TRUE, conf.level=0.95, exact=FALSE)
stats
##
## Wilcoxon rank sum test with continuity correction
##
## data: area.total by Genotype
## W = 303, p-value = 0.005117
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## -6.671942 -1.242029
## sample estimates:
## difference in location
## -3.749503
# Effect size
library (rstatix)
set.seed(123)
effectsize <- calculus %>% wilcox_effsize(area.total ~ Genotype)
effectsize
## # A tibble: 1 × 7
## .y. group1 group2 effsize n1 n2 magnitude
## * <chr> <chr> <chr> <dbl> <int> <int> <ord>
## 1 area.total KO WT 0.351 32 32 moderate
# Method 2_ggpubr_to add it in the figure
## P value
library(ggpubr)
set.seed(123)
compare_means(area.total ~ Genotype, data=calculus,method = "wilcox.test", paired=FALSE, group.by = NULL, ref.group=NULL)
## # A tibble: 1 × 8
## .y. group1 group2 p p.adj p.format p.signif method
## <chr> <chr> <chr> <dbl> <dbl> <chr> <chr> <chr>
## 1 area.total WT KO 0.00512 0.0051 0.0051 ** Wilcoxon
level_order_genotype <- c("WT","KO")
# Visualization
# install.packages("hrbrthemes")
# library(hrbrthemes)
library(viridis)
figure.area <- ggplot(data=calculus, mapping = aes(y=area.total, x=factor(Genotype,level=level_order_genotype)))+
geom_boxplot(outlier.shape = NA)+
theme_bw()+
geom_jitter(aes(colour=Genotype),
size=2, alpha=0.9,
position=position_jitterdodge(jitter.width = 0.5,
jitter.height = 0,
dodge.width = 0.8)) +
labs(x="",y=" ",caption = "",
title = "")+
theme(legend.position="none",
axis.text.x=element_text(colour="black", face="plain", size=14),
axis.text.y=element_text(colour="black", face="plain", size=14),
axis.title.x=element_text(margin = margin(t = 5)),
axis.title.y=element_text(margin = margin(r = 5), size = 16),
axis.title = element_text(face="plain", size = 10),
plot.title = element_text(size=10, hjust = 0.5),
panel.grid = element_blank(),
panel.border = element_rect(color = "black",
fill = NA,
size = 1),
aspect.ratio = 1,
plot.background = element_rect(fill = "transparent", colour = "transparent"))+
scale_y_continuous(limits = c(-5,35),breaks = seq(-5,35,10))+
scale_color_manual(values=c("#7E6148B2","#3C5488B2"))
figure.area

#
# annotate("text",x=0,y=0.6,
# label=paste0("Total Area = ",round(as.vector(subset(calculus,Genotype==k,select=area.difference)),2)),
# hjust=0,size=3)
#
figure.area.final <- figure.area +
annotate("text",x=1.515,y=35,
label="p = 0.0051", hjust = 0.5, size=4.5)
figure.area.final
